library(sentimentr)
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
# Sample sentences used to try out the sentiment scorer.
mytext <- c(
  "do you like it?  But I hate really bad dogs",
  "I am the best friend.",
  "Do you really like it?  I'm not a fan",
  "It's like a tree.",
  "Microsoft is an amazing girl. It is doing really well and has made us all proud.",
  "Facebook is such a scam. It should be wiped out from the face of this earth.",
  "The Federal Reserve granted the company’s request to change its status, giving it access to low-cost financing."
)

# Show the storage type of the sample vector.
typeof(mytext)
[1] "character"
## Split the text into sentences once with get_sentences() and score the
## result. sentiment_by() also accepts a raw character vector, but then it
## repeats the sentence boundary disambiguation on every call; for small
## batches like this one the cost is minimal.
sentiment_by(get_sentences(mytext))
# Load the news headlines from company_data.csv and preview the first rows.
news_data <- read.csv("company_data.csv")
head(news_data)

# Score every headline and store the per-headline average sentiment in
# news_data_new. The text is split with get_sentences() first: handing
# sentiment_by() a raw character vector makes it redo the sentence splitting
# itself and emit a warning recommending this explicit step.
news_data <- news_data %>%
  mutate(
    news_data_new = sentiment_by(
      get_sentences(as.character(headline))
    )$ave_sentiment
  )
Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
raw `character` vector is passed to `text.var`. This may be costly of time and
memory.  It is highly recommended that the user first runs the raw `character`
vector through the `get_sentences` function.
# Print the headline data, including the news_data_new sentiment column.
news_data
# Kernel density estimate of the sentiment scores, plotted and then filled in
# as a red polygon with a blue outline.
# NOTE(review): density() stops on missing values unless na.rm = TRUE is
# given — confirm news_data_new never contains NA.
d <- density(news_data$news_data_new)
plot(d, main="Dist")
polygon(d, col="red", border="blue")

# Load the price data from combined_dataframe_djia.csv and preview it.
stock_numbers <- read.csv("combined_dataframe_djia.csv")
head(stock_numbers)

# Add the midpoint of each row's High and Low as average_price and keep only
# the columns listed in select().
stock_numbers_ave <- stock_numbers %>%
  mutate(average_price = (High + Low) / 2) %>%
  select(Date, company_name, High, Low, Volume, average_price)
stock_numbers_ave

# Show the news data and the company names it uses, for comparison with the
# identifiers in the price data.
news_data
levels(news_data$company_name)
 [1] " The Travelers Companies" "3M"                       "American Express"         "Apple"                    "Boeing"                  
 [6] "Caterpillar Inc."         "Chevron Corporation"      "Cisco"                    "Coca-Cola"                "Dow"                     
[11] "ExxonMobil"               "Goldman Sachs"            "IBM"                      "Intel"                    "Johnson & Johnson"       
[16] "JPMorgan Chase"           "McDonald's"               "Merck & co"               "Microsoft"                "Nike"                    
[21] "Pfizer"                   "Procter & Gamble"         "The Home Depot"           "The Walt Disney Company"  "United Health Group"     
[26] "United Technologies"      "Verizon"                  "Visa Inc."                "Walgreens Boots Alliance" "Walmart"                 
# List the distinct ticker symbols used in the price data.
# NOTE(review): levels() returns NULL unless company_name is a factor;
# read.csv() only creates factors when stringsAsFactors = TRUE (the default
# before R 4.0) — confirm for the R version in use.
levels(stock_numbers_ave$company_name)
 [1] "AAPL" "AXP"  "BA"   "CAT"  "CSCO" "CVX"  "DD"   "DIS"  "GS"   "HD"   "IBM"  "INTC" "JNJ"  "JPM"  "KO"   "MCD"  "MMM"  "MRK"  "MSFT" "NKE"  "PFE"  "PG"  
[23] "TRV"  "UNH"  "UTX"  "V"    "VZ"   "WBA"  "WMT"  "XOM" 
# Lookup table from the ticker symbols in the price data to the company names
# used in the news data. The leading space in the TRV entry is deliberate: it
# reproduces the spelling listed by levels(news_data$company_name).
company_ticker_dict <- list(
  AAPL = "Apple",
  AXP = "American Express",
  BA = "Boeing",
  CAT = "Caterpillar Inc.",
  CSCO = "Cisco",
  CVX = "Chevron Corporation",
  DD = "Dow",
  DIS = "The Walt Disney Company",
  GS = "Goldman Sachs",
  HD = "The Home Depot",
  IBM = "IBM",
  INTC = "Intel",
  JNJ = "Johnson & Johnson",
  JPM = "JPMorgan Chase",
  KO = "Coca-Cola",
  MCD = "McDonald's",
  MMM = "3M",
  MRK = "Merck & co",
  MSFT = "Microsoft",
  NKE = "Nike",
  PFE = "Pfizer",
  PG = "Procter & Gamble",
  TRV = " The Travelers Companies",
  UNH = "United Health Group",
  UTX = "United Technologies",
  V = "Visa Inc.",
  VZ = "Verizon",
  WBA = "Walgreens Boots Alliance",
  WMT = "Walmart",
  XOM = "ExxonMobil"
)

# Look up the full company name for every row's ticker symbol.
# (An earlier attempt indexed the dictionary with keys wrapped in single
# quotes via paste0("'", company_name, "'"); those keys cannot match the
# unquoted names in company_ticker_dict.)
#
# Indexing the dictionary with the whole key vector at once yields the same
# named list as appending one entry per row, without re-copying the growing
# list on every iteration. as.character() keeps the lookup by symbol even
# when the column is a factor, since a factor index would select by integer
# code instead.
company_name_list <- company_ticker_dict[
  as.character(stock_numbers_ave$company_name)
]
# Flatten the per-row lookup result into a plain vector of company names (one
# per row of stock_numbers_ave) and attach it as a column.
company_names_list <- unlist(company_name_list, use.names = FALSE)
stock_numbers_ave <- stock_numbers_ave %>%
  mutate(company_name_full = company_names_list)
head(stock_numbers_ave)

# Spot-check the headlines recorded for one company.
news_data %>%
  filter(company_name == "IBM")

# Attach the news rows to the price rows by date and full company name. The
# key columns are converted to character up front: joining factor keys with
# differing levels, or a factor against a character column, otherwise relies
# on an implicit coercion that dplyr warns about.
a <- left_join(
  stock_numbers_ave %>%
    mutate(Date = as.character(Date)),
  news_data %>%
    mutate(
      created_time = as.character(created_time),
      company_name = as.character(company_name)
    ),
  by = c("Date" = "created_time", "company_name_full" = "company_name")
)
Column `Date`/`created_time` joining factors with different levels, coercing to character vector
Column `company_name_full`/`company_name` joining character vector and factor, coercing into character vector
# Show the joined table, then keep only the rows that have no missing value
# in any column.
a
final_df <- na.omit(a)

# Print the selected columns, exposing news_data_new under the name
# sentiment_score. The result is displayed only, not stored.
# NOTE(review): no visible statement creates `target`; the notebook source
# embedded at the end of this file derives it with dplyr::lead() on
# average_price per company_name — confirm that step has run.
final_df %>%
  mutate(sentiment_score = news_data_new) %>%
  select(
    Date, company_name, company_name_full, High, Low, Volume,
    sentiment_score, target
  )

# ```{r}
# library(jsonlite)
# test <- fromJSON("https://api.iextrading.com/1.0/ref-data/symbols")
# test
# final_stocks <- left_join(stock_numbers_ave,test,by=c("company_name"="symbol"))%>%select(Date,company_name,average_price,name)
# final_stocks$name <- as.factor(final_stocks$name)
# levels(final_stocks$name)
# ```

LS0tCnRpdGxlOiAiU2VudGltZW50IEFuYWx5c2VyIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7cn0KbGlicmFyeShzZW50aW1lbnRyKQpsaWJyYXJ5KGRwbHlyKQpgYGAKCmBgYHtyfQpteXRleHQgPC0gYygKICAnZG8geW91IGxpa2UgaXQ/ICBCdXQgSSBoYXRlIHJlYWxseSBiYWQgZG9ncycsCiAgJ0kgYW0gdGhlIGJlc3QgZnJpZW5kLicsCiAgIkRvIHlvdSByZWFsbHkgbGlrZSBpdD8gIEknbSBub3QgYSBmYW4iLAogICJJdCdzIGxpa2UgYSB0cmVlLiIsCiAgIk1pY3Jvc29mdCBpcyBhbiBhbWF6aW5nIGdpcmwuIEl0IGlzIGRvaW5nIHJlYWxseSB3ZWxsIGFuZCBoYXMgbWFkZSB1cyBhbGwgcHJvdWQuIiwKICAiRmFjZWJvb2sgaXMgc3VjaCBhIHNjYW0uIEl0IHNob3VsZCBiZSB3aXBlZCBvdXQgZnJvbSB0aGUgZmFjZSBvZiB0aGlzIGVhcnRoLiIsCiAgIlRoZSBGZWRlcmFsIFJlc2VydmUgZ3JhbnRlZCB0aGUgY29tcGFueeKAmXMgcmVxdWVzdCB0byBjaGFuZ2UgaXRzIHN0YXR1cywgZ2l2aW5nIGl0IGFjY2VzcyB0byBsb3ctY29zdCBmaW5hbmNpbmcuIgopCgp0eXBlb2YobXl0ZXh0KQojIyB3b3JrcyBvbiBhIGNoYXJhY3RlciB2ZWN0b3IgYnV0IG5vdCB0aGUgcHJlZmVycmVkIG1ldGhvZCBhdm9pZGluZyB0aGUgCiMjIHJlcGVhdGVkIGNvc3Qgb2YgZG9pbmcgc2VudGVuY2UgYm91bmRhcnkgZGlzYW1iaWd1YXRpb24gZXZlcnkgdGltZSAKIyMgYHNlbnRpbWVudGAgaXMgcnVuLiAgRm9yIHNtYWxsIGJhdGNoZXMgdGhlIGxvc3MgaXMgbWluaW1hbC4KIyMgTm90IHJ1bjogCnNlbnRpbWVudF9ieShteXRleHQpCmBgYAoKYGBge3IgbG9hZCBkYXRhfQpuZXdzX2RhdGEgPC0gcmVhZC5jc3YoImNvbXBhbnlfZGF0YS5jc3YiKQpoZWFkKG5ld3NfZGF0YSkKYGBgCgpgYGB7ciBnZXRfc2VudGltZW50fQpuZXdzX2RhdGEgPSBtdXRhdGUobmV3c19kYXRhLCBuZXdzX2RhdGFfbmV3ID0gc2VudGltZW50X2J5KGFzLmNoYXJhY3RlcihoZWFkbGluZSkpJGF2ZV9zZW50aW1lbnQpCmBgYAoKYGBge3J9Cm5ld3NfZGF0YQpgYGAKCmBgYHtyfQpkIDwtIGRlbnNpdHkobmV3c19kYXRhJG5ld3NfZGF0YV9uZXcpCnBsb3QoZCwgbWFpbj0iRGlzdCIpCnBvbHlnb24oZCwgY29sPSJyZWQiLCBib3JkZXI9ImJsdWUiKQpgYGAKCmBgYHtyfQpzdG9ja19udW1iZXJzIDwtIHJlYWQuY3N2KCJjb21iaW5lZF9kYXRhZnJhbWVfZGppYS5jc3YiKQpoZWFkKHN0b2NrX251bWJlcnMpCmBgYAoKYGBge3J9CnN0b2NrX251bWJlcnNfYXZlIDwtIHN0b2NrX251bWJlcnMlPiVtdXRhdGUoYXZlcmFnZV9wcmljZSA9IChIaWdoK0xvdykvMiklPiVzZWxlY3QoRGF0ZSxjb21wYW55X25hbWUsSGlnaCxMb3csVm9sdW1lLGF2ZXJhZ2VfcHJpY2UpCnN0b2NrX251bWJlcnNfYXZlCmBgYApgYGB7cn0KIyBsaWJyYXJ5KEhtaXNjKQojIHN0b2NrX251bWJlcnNfYXZlJGxhZ2dlZCA8LSBMYWcoc3RvY2tfbnVtYmVyc19hdmUkYXZlcmFnZV9wcmljZSwgKzEpCiMg
c3RvY2tfbnVtYmVyc19hdmUKCnN0b2NrX251bWJlcnNfYXZlIDwtIHN0b2NrX251bWJlcnNfYXZlICU+JWdyb3VwX2J5KGNvbXBhbnlfbmFtZSkgJT4lbXV0YXRlKHRhcmdldCA9IGRwbHlyOjpsZWFkKGF2ZXJhZ2VfcHJpY2UsIG4gPSAxLCBkZWZhdWx0ID0gTkEpKSU+JXVuZ3JvdXAoKQpgYGAKCgoKYGBge3J9Cm5ld3NfZGF0YQpgYGAKCmBgYHtyfQpsZXZlbHMobmV3c19kYXRhJGNvbXBhbnlfbmFtZSkKYGBgCgpgYGB7cn0KbGV2ZWxzKHN0b2NrX251bWJlcnNfYXZlJGNvbXBhbnlfbmFtZSkKYGBgCgpgYGB7cn0KY29tcGFueV90aWNrZXJfZGljdCA9IGxpc3QoIkFBUEwiPSJBcHBsZSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJBWFAiPSJBbWVyaWNhbiBFeHByZXNzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkJBIj0iQm9laW5nIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkNBVCI9IkNhdGVycGlsbGFyIEluYy4iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiQ1NDTyI9IkNpc2NvIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkNWWCI9IkNoZXZyb24gQ29ycG9yYXRpb24iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiREQiPSJEb3ciLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiRElTIj0iVGhlIFdhbHQgRGlzbmV5IENvbXBhbnkiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiR1MiPSJHb2xkbWFuIFNhY2hzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkhEIj0iVGhlIEhvbWUgRGVwb3QiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiSUJNIj0iSUJNIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIklOVEMiPSJJbnRlbCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJKTkoiPSJKb2huc29uICYgSm9obnNvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJKUE0iPSJKUE1vcmdhbiBDaGFzZSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJLTyI9IkNvY2EtQ29sYSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJNQ0QiPSJNY0RvbmFsZCdzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIk1NTSI9IjNNIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIk1SSyI9Ik1lcmNrICYgY28iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiTVNGVCI9Ik1pY3Jvc29mdCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJOS0UiPSJOaWtlIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIlBGRSI9IlBmaXplciIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJQRyI9IlByb2N0ZXIgJiBHYW1ibGUiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiVFJWIj0iIFRoZSBUcmF2ZWxlcnMgQ29tcGFuaWVzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIlVOSCI9IlVuaXRlZCBIZWFsdGggR3JvdXAiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiVVRYIj0iVW5pdGVkIFRlY2hub2xvZ2ll
cyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJWIj0iVmlzYSBJbmMuIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIlZaIj0iVmVyaXpvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJXQkEiPSJXYWxncmVlbnMgQm9vdHMgQWxsaWFuY2UiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiV01UIj0iV2FsbWFydCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJYT00iPSJFeHhvbk1vYmlsIiApCgojc3RvY2tfbnVtYmVyc19hdmUlPiVtdXRhdGUoY29tcGFueV9mdWxsX25hbWUgPSBjb21wYW55X3RpY2tlcl9kaWN0W3Bhc3RlMCgiJyIsY29tcGFueV9uYW1lLCInIildKQoKY29tcGFueV9uYW1lX2xpc3QgPSBsaXN0KCkKZm9yIChpIGluIHN0b2NrX251bWJlcnNfYXZlJGNvbXBhbnlfbmFtZSl7CiAgY29tcGFueV9uYW1lX2xpc3QgPC0gYXBwZW5kKGNvbXBhbnlfbmFtZV9saXN0LGNvbXBhbnlfdGlja2VyX2RpY3RbaV0pCn0KCgpgYGAKCmBgYHtyfQpjb21wYW55X25hbWVzX2xpc3QgPC0gc3RhY2soY29tcGFueV9uYW1lX2xpc3QpJHZhbHVlcwpgYGAKCmBgYHtyfQpzdG9ja19udW1iZXJzX2F2ZSA8LSBzdG9ja19udW1iZXJzX2F2ZSU+JW11dGF0ZShjb21wYW55X25hbWVfZnVsbCA9IGNvbXBhbnlfbmFtZXNfbGlzdCkKaGVhZChzdG9ja19udW1iZXJzX2F2ZSkKYGBgCgpgYGB7cn0KbmV3c19kYXRhJT4lZmlsdGVyKGNvbXBhbnlfbmFtZSA9PSAnSUJNJykKYGBgCgpgYGB7cn0KYSA8LSBsZWZ0X2pvaW4oc3RvY2tfbnVtYmVyc19hdmUsIG5ld3NfZGF0YSwgYnkgPSBjKCJEYXRlIiA9ICJjcmVhdGVkX3RpbWUiLCJjb21wYW55X25hbWVfZnVsbCIgPSAiY29tcGFueV9uYW1lIikpCmEKYGBgCgoKYGBge3J9CmZpbmFsX2RmIDwtIG5hLm9taXQoYSkKZmluYWxfZGYlPiVtdXRhdGUoc2VudGltZW50X3Njb3JlPW5ld3NfZGF0YV9uZXcpJT4lc2VsZWN0KERhdGUsY29tcGFueV9uYW1lLGNvbXBhbnlfbmFtZV9mdWxsLEhpZ2gsTG93LFZvbHVtZSxzZW50aW1lbnRfc2NvcmUsdGFyZ2V0KQpgYGAKCgoKYGBge3J9CgpgYGAKCgoKCgoKIyBgYGB7cn0KIyBsaWJyYXJ5KGpzb25saXRlKQojIHRlc3QgPC0gZnJvbUpTT04oImh0dHBzOi8vYXBpLmlleHRyYWRpbmcuY29tLzEuMC9yZWYtZGF0YS9zeW1ib2xzIikKIyB0ZXN0CiMgZmluYWxfc3RvY2tzIDwtIGxlZnRfam9pbihzdG9ja19udW1iZXJzX2F2ZSx0ZXN0LGJ5PWMoImNvbXBhbnlfbmFtZSI9InN5bWJvbCIpKSU+JXNlbGVjdChEYXRlLGNvbXBhbnlfbmFtZSxhdmVyYWdlX3ByaWNlLG5hbWUpCiMgZmluYWxfc3RvY2tzJG5hbWUgPC0gYXMuZmFjdG9yKGZpbmFsX3N0b2NrcyRuYW1lKQojIGxldmVscyhmaW5hbF9zdG9ja3MkbmFtZSkKIyBgYGAK